from transformers import AutoTokenizer, AutoModel


# Path to the locally downloaded ChatGLM3-6B checkpoint; shared by the
# tokenizer and every model-loading variant below (was duplicated inline).
MODEL_PATH = "C:\\Users\\Administrator\\Desktop\\python\\study\\bigmodel\\THUDM\\chatglm3-6b"


def main() -> None:
    """Load ChatGLM3-6B from a local checkpoint and run a two-turn chat demo.

    Side effects: loads multi-GB model weights onto the GPU and prints the
    model's replies to stdout.
    """
    # trust_remote_code is required: ChatGLM3 ships custom model/tokenizer
    # classes inside the checkpoint directory.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

    # Load the model on the GPU.
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True, device='cuda')
    # If GPU memory is limited, load with 4-bit quantization instead:
    # model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).quantize(4).cuda()
    # CPU-only alternative (fp32):
    # model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float()

    # Switch to inference mode (disables dropout, etc.).
    model = model.eval()

    # First turn: start with an empty conversation history.
    response, history = model.chat(tokenizer, "你好", history=[])
    print(response)
    # Second turn: pass the accumulated history so the model has context.
    response, history = model.chat(tokenizer, "晚上睡不着应该怎么办", history=history)
    print(response)


if __name__ == "__main__":
    main()